load packages

library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)

load coded data

coded = read.csv("~/Documents/code/dsnlab/automotion-test-set/tds_artifact_coded_volumes.csv")

load stripe detection data

fileDir = "~/Documents/code/dsnlab/automotion-test-set/output/"
filePattern = "tds_stripes_.*.csv"
  
file_list = list.files(fileDir, pattern = filePattern)

for (file in file_list){
  # if the merged dataset doesn't exist, create it
  if (!exists("stripes")){
    temp = read.csv(paste0(fileDir,file))
    stripes = data.frame(temp) %>% 
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
    rm(temp)
  }
  
  # if the merged dataset does exist, append to it
  else {
    temp_dataset = read.csv(paste0(fileDir,file))
    temp_dataset = data.frame(temp_dataset) %>% 
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
    stripes = rbind(stripes, temp_dataset)
    rm(temp_dataset)
  }
}

load global intensities and rps

# define paths and variables
rpDir = '~/Documents/code/dsnlab/automotion-test-set/rp_txt/'
outputDir = '~/Documents/code/tds_auto-motion/auto-motion-output/'
study = "tds2"
rpPattern = "^rp_([0-9]{3})_(.*).txt"
rpCols = c("euclidian_trans","euclidian_rot","euclidian_trans_deriv","euclidian_rot_deriv","trash.rp")

# global intensities
intensities = read.csv(paste0(outputDir,study,'_globalIntensities.csv'))

# edit volume numbers for subject 157, stop3
intensities = intensities %>% 
  mutate(volume = ifelse(subjectID == 157 & run == "stop3" & volume > 43, volume - 1, volume))

# rp files
file_list = list.files(rpDir, pattern = rpPattern)

for (file in file_list){
  # if the merged dataset doesn't exist, create it
  if (!exists("rp")){
    temp = read.table(paste0(rpDir,file))
    colnames(temp) = rpCols
    rp = data.frame(temp, file = rep(file,count(temp))) %>% 
      mutate(volume = row_number()) %>%
      extract(file,c("subjectID","run"), rpPattern) %>%
      mutate(subjectID = as.integer(subjectID))
    rm(temp)
  }
  
  # if the merged dataset does exist, append to it
  else {
    temp_dataset = read.table(paste0(rpDir,file))
    colnames(temp_dataset) = rpCols
    temp_dataset = data.frame(temp_dataset, file = rep(file,count(temp_dataset))) %>% 
      mutate(volume = row_number()) %>%
      extract(file,c("subjectID","run"), rpPattern) %>%
      mutate(subjectID = as.integer(subjectID))
    rp = rbind(rp, temp_dataset)
    rm(temp_dataset)
  }
}

join dataframes

joined = left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
  left_join(., intensities, by = c("subjectID", "run", "volume")) %>%
  left_join(., rp, by = c("subjectID", "run", "volume")) %>%
  mutate(striping = ifelse(is.na(striping), 0, striping),
         intensity = ifelse(is.na(intensity), 0, intensity),
         tile = paste0("tile_",tile),
         artifact = ifelse(striping > 1, 1, 0)) %>%
  group_by(subjectID, run, tile) %>%
  mutate(Diff.mean = volMean - lag(volMean),
         Diff.sd = volSD - lag(volSD)) %>%
  spread(tile, freqtile_power)

split the data

set.seed(101) 
sample = sample.split(joined$artifact, SplitRatio = .75)
training = subset(joined, sample == TRUE)
testing = subset(joined, sample == FALSE)

machine learning

tidy data

train.ml = training %>%
  group_by(subjectID, run) %>%
  mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
         Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
  gather(tile,freqtile_power, starts_with("tile")) %>%
  mutate(tile = paste0(tile,"_c")) %>%
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  select(-striping, - intensity, -trash.rp, -fsl.volume, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
  select(subjectID, run, volume, artifact, everything())

test.ml = testing %>%
  group_by(subjectID, run) %>%
  mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
         Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
  gather(tile,freqtile_power, starts_with("tile")) %>%
  mutate(tile = paste0(tile,"_c")) %>%
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  select(-striping, - intensity, -trash.rp, -fsl.volume, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
  select(subjectID, run, volume, artifact, everything())

use lasso logistic regression to fit beta weights for each predictor

training

# subset predictors and criterion
x_train = as.matrix(train.ml[,-c(1,2,3,4)])
y_train = as.double(as.matrix(train.ml[, 4]))

# run xval to determine lambda
cv.train <- cv.glmnet(x_train, y_train, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')

plot(cv.train)

plot(cv.train$glmnet.fit, xvar="lambda", label=TRUE)

cv.train$lambda.min
## [1] 0.0001055024
cv.train$lambda.1se
## [1] 0.01331289
coef(cv.train, s=cv.train$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)              -5.34966479
## euclidian_trans_deriv    -1.96821531
## euclidian_rot_deriv       1.25812915
## Diff.mean                 0.01511855
## Diff.sd                   0.03814063
## tile_1_c                -68.96785531
## tile_10_c              7750.81268518
## tile_11_c             -5214.91044479
## tile_2_c               -349.90077443
## tile_3_c                  .         
## tile_4_c              -1015.84159332
## tile_5_c              -1210.50826519
## tile_6_c                176.50824285
## tile_7_c               1794.76700119
## tile_8_c               3217.08847243
## tile_9_c               -811.96646874
coef(cv.train, s=cv.train$lambda.1se)
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                                   1
## (Intercept)             -4.24641815
## euclidian_trans_deriv    .         
## euclidian_rot_deriv      .         
## Diff.mean                .         
## Diff.sd                  0.01718502
## tile_1_c               -17.31491278
## tile_10_c             3019.45382532
## tile_11_c                .         
## tile_2_c                 .         
## tile_3_c                 .         
## tile_4_c                 .         
## tile_5_c                 .         
## tile_6_c                 .         
## tile_7_c                 .         
## tile_8_c                 .         
## tile_9_c                 .
# test on sample
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")

# plot cutoff v. accuracy
predicted = prediction(pred_train, y_train, label.ordering = NULL)
perf = performance(predicted, measure = "acc")
perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])

ggplot(perf.df, aes(cut, acc)) +
  geom_line()

# plot false v. true positive rate
perf = performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])

ggplot(perf.df, aes(fpr, tpr)) +
  geom_line()

# plot specificity v. sensitivity
perf = performance(predicted, measure = "sens", x.measure = "spec")
perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
  geom_line()

ggplot(perf.df, aes(x = cut)) +
  geom_line(aes(y = sens, color = "sensitivity")) + 
  geom_line(aes(y = spec, color = "specificity"))

perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.02640321
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.738462
# confusion matrix
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")
pred_train[pred_train > .03] = 1
pred_train[pred_train < .03] = 0
confusionMatrix(pred_train, y_train)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5530   34
##          1  127   85
##                                             
##                Accuracy : 0.9721            
##                  95% CI : (0.9675, 0.9762)  
##     No Information Rate : 0.9794            
##     P-Value [Acc > NIR] : 0.9999            
##                                             
##                   Kappa : 0.5004            
##  Mcnemar's Test P-Value : 0.0000000000004149
##                                             
##             Sensitivity : 0.9775            
##             Specificity : 0.7143            
##          Pos Pred Value : 0.9939            
##          Neg Pred Value : 0.4009            
##              Prevalence : 0.9794            
##          Detection Rate : 0.9574            
##    Detection Prevalence : 0.9633            
##       Balanced Accuracy : 0.8459            
##                                             
##        'Positive' Class : 0                 
## 

testing

# subset predictors and criterion
x_test = as.matrix(test.ml[,-c(1,2,3,4)])
y_test = as.double(as.matrix(test.ml[, 4]))

# test on holdout sample
pred_test = predict(cv.train, newx = x_test, s=cv.train$lambda.1se, type="response")
pred_test[pred_test > .03] = 1
pred_test[pred_test < .03] = 0

# confusion matrix
confusionMatrix(pred_test, y_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1846    7
##          1   40   33
##                                          
##                Accuracy : 0.9756         
##                  95% CI : (0.9677, 0.982)
##     No Information Rate : 0.9792         
##     P-Value [Acc > NIR] : 0.8828         
##                                          
##                   Kappa : 0.5726         
##  Mcnemar's Test P-Value : 0.000003046    
##                                          
##             Sensitivity : 0.9788         
##             Specificity : 0.8250         
##          Pos Pred Value : 0.9962         
##          Neg Pred Value : 0.4521         
##              Prevalence : 0.9792         
##          Detection Rate : 0.9585         
##    Detection Prevalence : 0.9621         
##       Balanced Accuracy : 0.9019         
##                                          
##        'Positive' Class : 0              
## 

svm

training

train.svm = train.ml[,-c(1,2,3)] %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# specify control parameters
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)

# run initial model
set.seed(101)
svmFit = train(artifact ~ ., 
               data = train.svm, 
               method = "svmLinear", 
               trControl = fitControl,
               preProcess = c("center", "scale"),
               tuneLength = 10,
               metric = "ROC",
               verbose = FALSE)
svmFit$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 157 
## 
## Objective Function Value : -150.248 
## Training error : 0.010215 
## Probability model included.
# predict model
train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
  select(-no)

# plot cutoff v. accuracy
predicted = prediction(train_pred, train.svm$artifact, label.ordering = NULL)
perf = performance(predicted, measure = "acc")
perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])

ggplot(perf.df, aes(cut, acc)) +
  geom_line()

# plot false v. true positive rate
perf = performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])

ggplot(perf.df, aes(fpr, tpr)) +
  geom_line()

# plot specificity v. sensitivity
perf = performance(predicted, measure = "sens", x.measure = "spec")
perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
  geom_line()

ggplot(perf.df, aes(x = cut)) +
  geom_line(aes(y = sens, color = "sensitivity")) + 
  geom_line(aes(y = spec, color = "specificity"))

perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.02344554
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.743969
# cut and assess accuracy in training sample
train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
  select(-no)
train_pred =  as.matrix(train_pred)
train_pred[train_pred > .07] = "yes"
train_pred[train_pred < .07] = "no"
confusionMatrix(train_pred, train.svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5612   36
##        yes   45   83
##                                           
##                Accuracy : 0.986           
##                  95% CI : (0.9826, 0.9888)
##     No Information Rate : 0.9794          
##     P-Value [Acc > NIR] : 0.0001236       
##                                           
##                   Kappa : 0.6649          
##  Mcnemar's Test P-Value : 0.3740628       
##                                           
##             Sensitivity : 0.9920          
##             Specificity : 0.6975          
##          Pos Pred Value : 0.9936          
##          Neg Pred Value : 0.6484          
##              Prevalence : 0.9794          
##          Detection Rate : 0.9716          
##    Detection Prevalence : 0.9778          
##       Balanced Accuracy : 0.8448          
##                                           
##        'Positive' Class : no              
## 

testing

test.svm = test.ml[,-c(1,2,3)] %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# cut and assess accuracy in test sample
test_pred = predict(svmFit, newdata = test.svm, type="prob") %>%
  select(-no)
test_pred =  as.matrix(test_pred)
test_pred[test_pred > .07] = "yes"
test_pred[test_pred < .07] = "no"
confusionMatrix(test_pred, test.svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1867   11
##        yes   19   29
##                                           
##                Accuracy : 0.9844          
##                  95% CI : (0.9778, 0.9895)
##     No Information Rate : 0.9792          
##     P-Value [Acc > NIR] : 0.05977         
##                                           
##                   Kappa : 0.6512          
##  Mcnemar's Test P-Value : 0.20124         
##                                           
##             Sensitivity : 0.9899          
##             Specificity : 0.7250          
##          Pos Pred Value : 0.9941          
##          Neg Pred Value : 0.6042          
##              Prevalence : 0.9792          
##          Detection Rate : 0.9694          
##    Detection Prevalence : 0.9751          
##       Balanced Accuracy : 0.8575          
##                                           
##        'Positive' Class : no              
## 

save models

setwd("~/Documents/code/dsnlab/automotion-test-set")
saveRDS(cv.train, "model_log_TDS.rds")
saveRDS(svmFit, "model_svm_TDS.rds")

weighted model

# # create model weights (they sum to one)
# model_weights = ifelse(train.svm$artifact == "yes",
#                         (1/table(train.svm$artifact)[1]) * 0.5,
#                         (1/table(train.svm$artifact)[2]) * 0.5)
# 
# # use the same seed to ensure same cross-validation splits
# fitControl$seeds = svmFit$control$seeds
# 
# svmFit_weighted = train(artifact ~ .,
#                data = train.svm,
#                method = "svmLinear",
#                trControl = fitControl,
#                preProcess = c("center", "scale"),
#                tuneLength = 10,
#                metric = "ROC",
#                verbose = FALSE,
#                weights = model_weights)
# 
# svmFit_weighted$finalModel
# 
# test_pred_weighted = predict(svmFit_weighted, newdata = test.svm, type="prob") %>%
#   select(-no)
# test_pred_weighted =  as.matrix(test_pred_weighted)
# test_pred_weighted[test_pred_weighted > .07] = "yes"
# test_pred_weighted[test_pred_weighted < .07] = "no"
# confusionMatrix(test_pred_weighted, test.svm$artifact)

# # determine best cost parameter
# grid <- expand.grid(C = c(0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 5))
# set.seed(3233)
# svm_Linear_Grid <- train(artifact ~ ., 
#                          data = train.svm, method = "svmLinear",
#                          trControl=fitControl,
#                          preProcess = c("center", "scale"),
#                          tuneGrid = grid,
#                          tuneLength = 10)
# svm_Linear_Grid
# plot(svm_Linear_Grid)
# test_pred_grid = predict(svm_Linear_Grid, newdata = test.svm)
# confusionMatrix(test_pred_grid, test.svm$artifact)

auto-motion process

training

train.man = training %>%
  gather(tile, freqtile_power, starts_with("tile")) %>%
  filter(tile %in% c("tile_1", "tile_10")) %>%
  
  # code trash based on mean, sd, and rp
  ungroup %>%
  mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
         sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
         meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
         sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
         
         # code volumes above mean thresholds as trash
         upper.mean = meanDiff.mean + 2*sdDiff.mean,
         lower.mean = meanDiff.mean - 2*sdDiff.mean,
         trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
         
         upper.sd = meanDiff.sd + 2*sdDiff.sd,
         lower.sd = meanDiff.sd - 2*sdDiff.sd,
         trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
         
         # code volumes with more than +/- .25mm translation or rotation in Euclidian distance
         trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
         trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
  select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
  
  # code trash based on striping
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
  
  # combine trash
  mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
         trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%

  # recode as trash if volume behind and in front are both marked as trash
  mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
         
  # code first volume as trash if second volume is trash
  mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%

  # code hits
  mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
                ifelse(trash.combined == 0 & (artifact == 1), "neg",
                ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits)) %>%
  gather(tile, freqtile_power_c, c("tile_1", "tile_10"))

testing

test.man = testing %>%
  gather(tile, freqtile_power, starts_with("tile")) %>%
  filter(tile %in% c("tile_1", "tile_10")) %>%
  
  # code trash based on mean, sd, and rp 
  ungroup %>%
  mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
         sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
         meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
         sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
         
         # code volumes above mean thresholds as trash
         upper.mean = meanDiff.mean + 2*sdDiff.mean,
         lower.mean = meanDiff.mean - 2*sdDiff.mean,
         trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
         
         upper.sd = meanDiff.sd + 2*sdDiff.sd,
         lower.sd = meanDiff.sd - 2*sdDiff.sd,
         trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
         
         # code volumes with more than +/- .25mm translation or rotation in Euclidian distance
         trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
         trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
  select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
  
  # code trash based on striping
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
  
  # combine trash
  mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
         trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%

  # recode as trash if volume behind and in front are both marked as trash
  mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
         
  # code first volume as trash if second volume is trash
  mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%

  # code hits
  mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
                ifelse(trash.combined == 0 & (artifact == 1), "neg",
                ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits)) %>%
  gather(tile, freqtile_power_c, c("tile_1", "tile_10"))

compare hit rates

training data

# select only one set of observations and code correct rejections
train.tab = train.man %>% 
  filter(tile == "tile_1") %>%
  mutate(hits.tot = ifelse(hits %in% "hit", "hit",
                    ifelse(hits %in% "neg", "neg",
                    ifelse(hits %in% "pos", "pos",
                    ifelse(is.na(hits), "cor.rej", hits)))))

lasso logistic regression

confusionMatrix(pred_train, y_train)$table
##           Reference
## Prediction    0    1
##          0 5530   34
##          1  127   85
confusionMatrix(pred_train, y_train)$overall[1]
## Accuracy 
## 0.972126
confusionMatrix(pred_train, y_train)$byClass[11]
## Balanced Accuracy 
##         0.8459178

svm

confusionMatrix(train_pred, train.svm$artifact)$table
##           Reference
## Prediction   no  yes
##        no  5612   36
##        yes   45   83
confusionMatrix(train_pred, train.svm$artifact)$overall[1]
##  Accuracy 
## 0.9859765
confusionMatrix(train_pred, train.svm$artifact)$byClass[11]
## Balanced Accuracy 
##         0.8447621

manual

table(train.tab$hits.tot)
## 
## cor.rej     hit     neg     pos 
##    5609      90      28      49
cor.rej = table(train.tab$hits.tot)[[1]]
hit = table(train.tab$hits.tot)[[2]]
neg = table(train.tab$hits.tot)[[3]]
pos = table(train.tab$hits.tot)[[4]]
total = cor.rej + hit + neg + pos

sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.88"

test data

test.tab = test.man %>% 
  filter(tile == "tile_1") %>%
  mutate(hits.tot = ifelse(hits %in% "hit", "hit",
                    ifelse(hits %in% "neg", "neg",
                    ifelse(hits %in% "pos", "pos",
                    ifelse(is.na(hits), "cor.rej", NA)))))

lasso logistic regression

confusionMatrix(pred_test, y_test)$table
##           Reference
## Prediction    0    1
##          0 1846    7
##          1   40   33
confusionMatrix(pred_test, y_test)$overall[1]
##  Accuracy 
## 0.9755971
confusionMatrix(pred_test, y_test)$byClass[11]
## Balanced Accuracy 
##         0.9018955

svm

confusionMatrix(test_pred, test.svm$artifact)$table
##           Reference
## Prediction   no  yes
##        no  1867   11
##        yes   19   29
confusionMatrix(test_pred, test.svm$artifact)$overall[1]
##  Accuracy 
## 0.9844237
confusionMatrix(test_pred, test.svm$artifact)$byClass[11]
## Balanced Accuracy 
##         0.8574629

manual

table(test.tab$hits.tot)
## 
## cor.rej     hit     neg     pos 
##    1874      35       4      13
cor.rej = table(test.tab$hits.tot)[[1]]
hit = table(test.tab$hits.tot)[[2]]
neg = table(test.tab$hits.tot)[[3]]
pos = table(test.tab$hits.tot)[[4]]
total = cor.rej + hit + neg + pos

sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.95"

plot composite

combine training and test data

full_man = bind_rows(train.man, test.man)

plot machine learning

logistic regression

full_log = bind_rows(train.ml,test.ml)

# re-run lasso logistic regression on full sample
x = as.matrix(full_log[,-c(1,2,3,4)])
y = as.double(as.matrix(full_log[, 4]))
pred = predict(cv.train, newx = x, s=cv.train$lambda.1se, type="response")
pred[pred > .03] = 1
pred[pred < .03] = 0
confusionMatrix(pred, y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7376   41
##          1  167  118
##                                              
##                Accuracy : 0.973              
##                  95% CI : (0.9691, 0.9765)   
##     No Information Rate : 0.9794             
##     P-Value [Acc > NIR] : 0.9999             
##                                              
##                   Kappa : 0.5188             
##  Mcnemar's Test P-Value : <0.0000000000000002
##                                              
##             Sensitivity : 0.9779             
##             Specificity : 0.7421             
##          Pos Pred Value : 0.9945             
##          Neg Pred Value : 0.4140             
##              Prevalence : 0.9794             
##          Detection Rate : 0.9577             
##    Detection Prevalence : 0.9630             
##       Balanced Accuracy : 0.8600             
##                                              
##        'Positive' Class : 0                  
## 

svm

full_svm = bind_rows(train.ml,test.ml)  %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# re-run svm on full sample
full_pred = predict(svmFit, newdata = full_svm, type="prob") %>%
  select(-no)
full_pred =  as.matrix(full_pred)
full_pred[full_pred > .07] = "yes"
full_pred[full_pred < .07] = "no"
confusionMatrix(full_pred, full_svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7479   47
##        yes   64  112
##                                           
##                Accuracy : 0.9856          
##                  95% CI : (0.9827, 0.9881)
##     No Information Rate : 0.9794          
##     P-Value [Acc > NIR] : 0.00003083      
##                                           
##                   Kappa : 0.6613          
##  Mcnemar's Test P-Value : 0.1288          
##                                           
##             Sensitivity : 0.9915          
##             Specificity : 0.7044          
##          Pos Pred Value : 0.9938          
##          Neg Pred Value : 0.6364          
##              Prevalence : 0.9794          
##          Detection Rate : 0.9710          
##    Detection Prevalence : 0.9771          
##       Balanced Accuracy : 0.8480          
##                                           
##        'Positive' Class : no              
## 

manual

full_man1 = full_man %>%
  filter(tile == "tile_1")
confusionMatrix(full_man1$trash.combined, full_man1$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7386   32
##          1   62  125
##                                         
##                Accuracy : 0.9876        
##                  95% CI : (0.9849, 0.99)
##     No Information Rate : 0.9794        
##     P-Value [Acc > NIR] : 0.00000002905 
##                                         
##                   Kappa : 0.7205        
##  Mcnemar's Test P-Value : 0.00278       
##                                         
##             Sensitivity : 0.9917        
##             Specificity : 0.7962        
##          Pos Pred Value : 0.9957        
##          Neg Pred Value : 0.6684        
##              Prevalence : 0.9794        
##          Detection Rate : 0.9712        
##    Detection Prevalence : 0.9754        
##       Balanced Accuracy : 0.8939        
##                                         
##        'Positive' Class : 0             
## 

plot and compare models

data.plot.log = bind_cols(full_log, as.data.frame(y), as.data.frame(pred)) %>%
  mutate(hits = ifelse(y == 1 & `1` == 1, "hit",
                ifelse(y == 0 & `1` == 1, "pos",
                ifelse(y == 1 & `1` == 0, "neg", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

data.plot.svm = bind_cols(full_svm, as.data.frame(full_pred)) %>%
  rename("full_pred" = yes) %>%
  mutate(hits = ifelse(artifact == "yes" & full_pred == "yes", "hit",
                ifelse(artifact == "no" & full_pred == "yes", "pos",
                ifelse(artifact == "yes" & full_pred == "no", "neg", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

plot.comp = full_man %>%
  filter(tile == "tile_1") %>%
  select(subjectID, run, volume, euclidian_trans_deriv, hits) %>%
  rename("auto" = hits) %>%
  left_join(data.plot.log, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, hits) %>%
  rename("log" = hits) %>%
  left_join(data.plot.svm, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, log, hits) %>%
  rename("svm" = hits) %>%
  gather(model, hits, c("auto", "log", "svm")) %>%
  mutate(label = ifelse(regexpr('.*', hits), as.character(volume), ''))

ggplot(plot.comp, aes(x = volume, y = euclidian_trans_deriv)) +
  geom_line(size = .25) +
  geom_point(data = subset(plot.comp, !is.na(hits)), aes(color = hits), size = 2.5) +
  geom_text(aes(label = label), size = 1.5) +
  facet_wrap(~ subjectID + run + model, ncol = 9, scales = "free") +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
  theme(axis.text.x = element_text(size = 6))